{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fa4efc0-e2c2-428b-8209-7c5b98b4cf44",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os, time, datetime, ast, json\n",
    "from itertools import product\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.dates as mdates\n",
    "%matplotlib inline\n",
    "\n",
    "import geopandas as gpd\n",
    "import geopy.distance as geodist\n",
    "from shapely.ops import nearest_points\n",
    "\n",
    "os.chdir(\"/Users/xiaosongw/Dropbox/Research/InformedSources/Replication/Build\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e44253bd-ed53-4c15-8d53-955d3db2c217",
   "metadata": {},
   "source": [
    "# find stations in Melbourne"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8bb6a37-4f8f-40e4-9798-adcc67627cfc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# price data\n",
    "df_vic = pd.read_csv(\"./Output/is_vic_2005_2019_raw_merged.csv\", low_memory=False)\n",
    "# google map cleaned addresses\n",
    "df_coor = pd.read_csv(\"./Output/is_vic_gmap_clean_2005_2019.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a002ebc-f592-4e52-931a-dd04910e2fb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ucl shapefile\n",
    "gdf0_ucl = gpd.read_file(\"./Input/1270055004_ucl_2016_aust_shape\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92655aaa-f4e1-4e3a-90a2-ed5eabc38edb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_vic['id'] = df_vic['sitecode'].map(df_coor.set_index('sitecode')['sitecode_new'])\n",
    "df_vic['coor'] = df_vic['sitecode'].map(df_coor.set_index('sitecode')['coor'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5de129b-2ec7-4419-a487-7976577b8fa4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_vic[['sitecode', 'id', 'coor']].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3247391-2df1-43fc-a0ca-c490497ee6e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_is_st_vic = df_vic.loc[df_vic['coor'].notnull(), ['id', 'coor']].drop_duplicates()\n",
    "print('number of stations in vic', df_is_st_vic.shape)\n",
    "df_is_st_vic['lat'] = df_is_st_vic['coor'].apply(lambda x: ast.literal_eval(x)[0])\n",
    "df_is_st_vic['lng'] = df_is_st_vic['coor'].apply(lambda x: ast.literal_eval(x)[1])\n",
    "gdf_is_st_vic = gpd.GeoDataFrame(df_is_st_vic, \n",
    "            geometry=gpd.points_from_xy(df_is_st_vic.lng, df_is_st_vic.lat), crs=\"EPSG:4283\").copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "073b545f-ac47-4687-b838-0953ee3e0774",
   "metadata": {},
   "outputs": [],
   "source": [
    "gdf_is_st_vic['mel'] = (gdf_is_st_vic['geometry'].within(\n",
    "    gdf0_ucl.loc[gdf0_ucl['UCL_NAME16']=='Melbourne', 'geometry'].values[0])).astype(int)\n",
    "df_vic['mel'] = df_vic['id'].map(gdf_is_st_vic.set_index('id')['mel'].to_dict())\n",
    "l_sid_mel =gdf_is_st_vic.loc[gdf_is_st_vic['mel']==1, 'id'].unique().tolist()\n",
    "print(len(l_sid_mel))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90e49e08-691c-47ad-b75a-4bb61efef254",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add additional stations located near Melbourne\n",
    "# include stations located within 2km from the Melboure UCL border\n",
    "from shapely.ops import nearest_points\n",
    "gdf_is_st_nonmel = gdf_is_st_vic[gdf_is_st_vic['mel']==0].copy()\n",
    "gdf_is_st_nonmel['poly_pnt'] = gdf_is_st_nonmel['geometry'].apply(\n",
    "    lambda x: nearest_points(\n",
    "        gdf0_ucl.loc[gdf0_ucl['UCL_NAME16']=='Melbourne', 'geometry'].values[0], x)[0])\n",
    "gdf_is_st_nonmel['ucl_coor'] = gdf_is_st_nonmel['poly_pnt'].apply(lambda z: (z.y, z.x))\n",
    "gdf_is_st_nonmel['dist_km'] = gdf_is_st_nonmel[['coor', 'ucl_coor']].apply(\n",
    "    lambda x: geodist.distance(ast.literal_eval(x[0]), x[1]).km, axis=1)\n",
    "gdf_is_st_nonmel['mel'] = (gdf_is_st_nonmel['dist_km']<2).astype(int)\n",
    "l_sid_mel = l_sid_mel + gdf_is_st_nonmel.loc[gdf_is_st_nonmel['mel']==1, 'id'].unique().tolist()\n",
    "print('{} stations in Melbourne in the full sample'.format(len(l_sid_mel)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e03c9ebb-96a8-443b-8814-ae9efb1ce1bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out = df_vic[['id', 'date', 'brand', 'sitename', 'address1', 'suburb', 'postcode', \n",
    "                 'avgprice', 'collectionmethod', 'bid', 'coor', 'mel']].sort_values(\n",
    "    ['id', 'date'], ignore_index=True).copy()\n",
    "df_out.loc[df_out['id'].isin(l_sid_mel), 'mel'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fd02b20-af4b-4756-aec6-d4552262f234",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out[df_out['mel']==1].groupby('date')['avgprice'].count().plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b76b32c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out.to_csv(\"./Output/is_vic_p_2005_2019.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70f1267e-28ee-4be0-887a-52e562999696",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
